# import the required libraries 
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
import pingouin
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset, keeping every column except one named "index"
drug_safety = pd.read_csv(
    "drug_safety.csv",
    usecols=lambda column_name: column_name != "index",
)
drug_safety.head()

# Tally the adverse_effects values ("Yes"/"No") within each trx group
adv_eff_by_trx = drug_safety.groupby("trx")["adverse_effects"].value_counts()
adv_eff_by_trx

trx      adverse_effects
Drug     No                 9703
         Yes                1024
Placebo  No                 4864
         Yes                 512
Name: count, dtype: int64

# Sum the per-value counts over the "trx" index level to get the row total of each group
adv_eff_by_trx_totals = adv_eff_by_trx.groupby(level="trx").sum()
adv_eff_by_trx_totals

trx
Drug       10727
Placebo     5376
Name: count, dtype: int64

# Build a list of the "Yes" counts, ordered Drug then Placebo
yeses = [
    adv_eff_by_trx.loc[(group, "Yes")]
    for group in ("Drug", "Placebo")
]
yeses

[1024, 512]

# Build a list of the total row counts, ordered Drug then Placebo (same order as the "Yes" counts)
n = [
    adv_eff_by_trx_totals.loc[group]
    for group in ("Drug", "Placebo")
]
n

[10727, 5376]

# Two-sided z-test comparing the adverse-effect proportions of the two groups
two_sample_results = proportions_ztest(
    count=yeses,
    nobs=n,
    alternative="two-sided",
)
two_sample_results

(0.0452182684494942, 0.9639333330262475)

# The z-test result is a (test statistic, p-value) pair; keep the p-value
z_statistic, two_sample_p_value = two_sample_results
two_sample_p_value

0.9639333330262475

# Chi-squared test of independence between num_effects and trx
num_effects_groups = pingouin.chi2_independence(
    data=drug_safety,
    x="num_effects",
    y="trx",
)

# Element 2 of the result holds the test statistics; read "pval" from the row labelled 0
num_effects_p_value = num_effects_groups[2].loc[0, "pval"]

num_effects_p_value

C:\Users\newbe\anaconda3\Lib\site-packages\pingouin\contingency.py:151: UserWarning: Low count on observed frequencies.
  warnings.warn(f"Low count on {name} frequencies.")
C:\Users\newbe\anaconda3\Lib\site-packages\pingouin\contingency.py:151: UserWarning: Low count on expected frequencies.
  warnings.warn(f"Low count on {name} frequencies.")

0.6150123339426765

# Plot the distribution of age, coloured by treatment group
sns.histplot(
    data=drug_safety,
    x="age",
    hue="trx",
)

C:\Users\newbe\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):

<Axes: xlabel='age', ylabel='Count'>

# Confirm what the histogram suggests with a per-group normality test on age.
# The outcome decides between an unpaired t-test and a Wilcoxon-Mann-Whitney test.
normality = pingouin.normality(
    data=drug_safety,
    dv="age",
    group="trx",
    method="shapiro",  # the default method
    alpha=0.05,  # significance level; 0.05 is also the default
)

normality

C:\Users\newbe\anaconda3\Lib\site-packages\scipy\stats\_morestats.py:1882: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")
C:\Users\newbe\anaconda3\Lib\site-packages\scipy\stats\_morestats.py:1882: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

# Ages of the rows belonging to the Drug group
age_trx = drug_safety["age"][drug_safety["trx"] == "Drug"]

# Ages of the rows belonging to the Placebo group
age_placebo = drug_safety["age"][drug_safety["trx"] == "Placebo"]


# Age is not normally distributed in either group, so compare the groups
# with a two-sided Mann-Whitney U test instead of a t-test
age_group_effects = pingouin.mwu(
    x=age_trx,
    y=age_placebo,
    alternative="two-sided",
)

age_group_effects

# Pull the "p-val" column out of the result table
age_group_effects_p_value = age_group_effects.loc[:, "p-val"]
age_group_effects_p_value

MWU    0.256963
Name: p-val, dtype: float64

Column	Description
`sex`	The gender of the individual
`age`	The age of the individual
`week`	The week during which the drug testing took place
`trx`	The treatment group (Drug) or control group (Placebo)
`wbc`	White blood cell count
`rbc`	Red blood cell count
`adverse_effects`	Whether the individual experienced at least one adverse effect
`num_effects`	The total number of adverse effects experienced by the individual

	W	pval	normal
trx
Drug	0.976785	2.189152e-38	False
Placebo	0.975595	2.224950e-29	False

Project Description

Determine if the proportion of adverse effects differs significantly between the Drug and Placebo groups

Find out if the number of adverse effects is independent of the treatment and control groups

Final Summary

	age	sex	trx	week	wbc	rbc	adverse_effects
0	62	male	Drug	0	7.3	5.1	No
1	62	male	Drug	1	NaN	NaN	No
2	62	male	Drug	12	5.6	5.0	No
3	62	male	Drug	16	NaN	NaN	No
4	62	male	Drug	2	6.6	5.1	No